Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese"),
Religion=relevel(Religion,ref="None")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |>
mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
`Primary Language` = as.factor(`Primary Language`))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
qol <- qol |> mutate(across(c(Age,`Duration of Residency`,`Education Completed`),~scale(.x)))We will be using the Prediction Set to run an analysis of deviance to check model performance.
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Unmet.Health.Need)| Characteristic | N = 1,7211 |
|---|---|
| Unmet.Health.Need | |
| 0 | 1,548 (90%) |
| Yes | 173 (10%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Unmet.Health.Need=="Yes")|> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
set.seed(123321)
VSURF(Unmet.Health.Need~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Unmet.Health.Need ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 19.2 secs
VSURF selected:
27 variables at thresholding step (in 9.1 secs)
1 variables at interpretation step (in 9.5 secs)
1 variables at prediction step (in 0.6 secs)
VSURF ran in parallel on a PSOCK cluster and used 7 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "English.Speaking"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "English.Speaking"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.09904126
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 English.Speaking 0.01433 0.00089 black
2 Age 0.01065 0.00060 black
3 Duration.of.Residency 0.01004 0.00040 black
4 Ethnicity 0.00785 0.00062 red
5 Discrimination 0.00657 0.00025 black
6 Dental.Insurance 0.00616 0.00045 black
7 Religion 0.00574 0.00058 black
8 English.Difficulties 0.00492 0.00044 black
9 Marital.Status 0.00479 0.00038 black
10 Health.Insurance 0.00338 0.00047 black
11 Full.Time.Employment 0.00283 0.00025 black
12 Income_median 0.00228 0.00038 black
13 Identify.Ethnically 0.00227 0.00028 black
14 US.Born 0.00209 0.00018 black
15 Familiarity.with.America 0.00191 0.00032 black
16 Primary.Language 0.00182 0.00020 black
17 Belonging 0.00179 0.00029 black
18 Student 0.00170 0.00019 black
19 Familiarity.with.Ethnic.Origin 0.00068 0.00030 black
20 Retired 0.00066 0.00013 black
21 Gender 0.00054 0.00019 black
22 Homemaker 0.00013 0.00014 black
23 Part.Time.Employment 0.00011 0.00017 black
24 Self.Employed.Full.Time 0.00000 0.00000 black
25 Self.Employed.Part.Time 0.00000 0.00000 black
26 Disabled 0.00000 0.00000 black
27 Unemployed 0.00000 0.00000 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.pred]
pred_vars <- if("Ethnicity" %in% pred_vars){
pred_vars
} else {
c(pred_vars,"Ethnicity")
}
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")
options(contrasts = c("contr.sum","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Health.Need ~ English.Speaking
Model 2: Unmet.Health.Need ~ English.Speaking + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1717 1058.6
2 1712 1042.0 5 16.556 0.005423 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1109.088 1088.391
car::Anova(mod1, test.statistic = "F")Analysis of Deviance Table (Type II tests)
Response: Unmet.Health.Need
Error estimate based on Pearson residuals
Sum Sq Df F value Pr(>F)
English.Speaking 45.32 3 14.8808 1.449e-09 ***
Ethnicity 16.56 5 3.2615 0.00618 **
Residuals 1738.14 1712
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.03556 0.12173 -16.722 < 2e-16 ***
English.Speaking1 1.01133 0.20902 4.839 1.31e-06 ***
English.Speaking2 0.32016 0.13649 2.346 0.019 *
English.Speaking3 -0.84052 0.16431 -5.116 3.13e-07 ***
Ethnicity1 -0.21044 0.18711 -1.125 0.261
Ethnicity2 -0.34833 0.22688 -1.535 0.125
Ethnicity3 0.01486 0.28599 0.052 0.959
Ethnicity4 0.29352 0.18301 1.604 0.109
Ethnicity5 -0.29265 0.39474 -0.741 0.458
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1122.9 on 1720 degrees of freedom
Residual deviance: 1042.0 on 1712 degrees of freedom
AIC: 1060
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.0957 0.0148 Inf 0.0704 0.129
Asian Indian 0.0844 0.0192 Inf 0.0536 0.130
Filipino 0.1170 0.0339 Inf 0.0652 0.201
Korean 0.1491 0.0227 Inf 0.1098 0.199
Other 0.0888 0.0379 Inf 0.0375 0.196
Vietnamese 0.1835 0.0242 Inf 0.1407 0.236
Results are averaged over the levels of: English.Speaking
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") |> summary(infer=T) contrast estimate SE df asymp.LCL asymp.UCL z.ratio p.value
Chinese effect -0.2104 0.187 Inf -0.7041 0.283 -1.125 0.3911
Asian Indian effect -0.3483 0.227 Inf -0.9469 0.250 -1.535 0.2494
Filipino effect 0.0149 0.286 Inf -0.7397 0.769 0.052 0.9586
Korean effect 0.2935 0.183 Inf -0.1893 0.776 1.604 0.2494
Other effect -0.2927 0.395 Inf -1.3341 0.749 -0.741 0.5502
Vietnamese effect 0.5431 0.172 Inf 0.0901 0.996 3.163 0.0094
Results are averaged over the levels of: English.Speaking
Results are given on the log odds ratio (not the response) scale.
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
P value adjustment: fdr method for 6 tests
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)| Characteristic | N = 1,7171 |
|---|---|
| Unmet.Dental.Needs | |
| 0 | 1,528 (89%) |
| Yes | 189 (11%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Unmet.Dental.Needs=="Yes")|> nrow()
set.seed(123321)
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
VSURF(Unmet.Dental.Needs~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Unmet.Dental.Needs ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 22.9 secs
VSURF selected:
26 variables at thresholding step (in 11.6 secs)
1 variables at interpretation step (in 10.7 secs)
1 variables at prediction step (in 0.6 secs)
VSURF ran in parallel on a PSOCK cluster and used 7 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Dental.Insurance"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Dental.Insurance"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.1097263
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Dental.Insurance 0.01403 0.00057 black
2 Age 0.01082 0.00070 black
3 English.Speaking 0.01023 0.00057 black
4 Religion 0.00785 0.00056 black
5 Duration.of.Residency 0.00687 0.00052 black
6 Marital.Status 0.00548 0.00027 black
7 Income_median 0.00543 0.00036 black
8 Ethnicity 0.00411 0.00038 red
9 Familiarity.with.America 0.00385 0.00035 black
10 English.Difficulties 0.00249 0.00038 black
11 Primary.Language 0.00240 0.00031 black
12 Full.Time.Employment 0.00202 0.00030 black
13 Health.Insurance 0.00170 0.00044 black
14 US.Born 0.00170 0.00025 black
15 Retired 0.00152 0.00024 black
16 Discrimination 0.00125 0.00028 black
17 Student 0.00110 0.00021 black
18 Familiarity.with.Ethnic.Origin 0.00095 0.00033 black
19 Belonging 0.00091 0.00037 black
20 Identify.Ethnically 0.00066 0.00034 black
21 Homemaker 0.00032 0.00013 black
22 Gender 0.00025 0.00022 black
23 Self.Employed.Full.Time 0.00000 0.00000 black
24 Self.Employed.Part.Time 0.00000 0.00000 black
25 Disabled 0.00000 0.00000 black
26 Unemployed 0.00000 0.00000 black
27 Part.Time.Employment -0.00034 0.00015 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")Dental Insurance is the only variable in the Prediction Set, which means Ethnicity might not be a significant predictor of unmet dental needs.
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1715 1097.7
2 1710 1091.7 5 5.946 0.3115
performance::check_model(mod1)data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1143.842 1112.546
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.14863 0.10164 -21.139 <2e-16 ***
Dental.Insurance1 0.74942 0.08515 8.801 <2e-16 ***
Ethnicity1 -0.09192 0.16793 -0.547 0.584
Ethnicity2 -0.17522 0.18386 -0.953 0.341
Ethnicity3 0.03226 0.25564 0.126 0.900
Ethnicity4 0.22752 0.16628 1.368 0.171
Ethnicity5 -0.27426 0.34240 -0.801 0.423
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1190.5 on 1716 degrees of freedom
Residual deviance: 1091.7 on 1710 degrees of freedom
AIC: 1105.7
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.0962 0.0146 Inf 0.0712 0.129
Asian Indian 0.0892 0.0154 Inf 0.0632 0.124
Filipino 0.1075 0.0274 Inf 0.0644 0.174
Korean 0.1277 0.0185 Inf 0.0957 0.169
Other 0.0814 0.0300 Inf 0.0388 0.163
Vietnamese 0.1339 0.0196 Inf 0.0998 0.177
Results are averaged over the levels of: Dental.Insurance
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")|> summary(infer=T) contrast estimate SE df asymp.LCL asymp.UCL z.ratio p.value
Chinese effect -0.0919 0.168 Inf -0.535 0.351 -0.547 0.7010
Asian Indian effect -0.1752 0.184 Inf -0.660 0.310 -0.953 0.6347
Filipino effect 0.0323 0.256 Inf -0.642 0.707 0.126 0.8996
Korean effect 0.2275 0.166 Inf -0.211 0.666 1.368 0.5136
Other effect -0.2743 0.342 Inf -1.178 0.629 -0.801 0.6347
Vietnamese effect 0.2816 0.169 Inf -0.165 0.729 1.662 0.5136
Results are averaged over the levels of: Dental.Insurance
Results are given on the log odds ratio (not the response) scale.
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
P value adjustment: fdr method for 6 tests
The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.
rfdata <- qol |>
select(`Physical Check-up`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)| Characteristic | N = 1,7141 |
|---|---|
| Physical.Check.up | |
| 0 | 524 (31%) |
| Yes | 1,190 (69%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Physical.Check.up=="0")|> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
set.seed(123321)
set.seed(123321)
VSURF(Physical.Check.up~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 39.5 secs
VSURF selected:
27 variables at thresholding step (in 20.8 secs)
6 variables at interpretation step (in 14.6 secs)
4 variables at prediction step (in 4.1 secs)
VSURF ran in parallel on a PSOCK cluster and used 7 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "Health.Insurance" "Dental.Insurance"
[4] "Income_median"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "Age" "Health.Insurance"
[4] "Dental.Insurance" "Income_median" "Student"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.2763711
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.03324 0.00087 black
2 Age 0.02493 0.00065 black
3 Health.Insurance 0.01433 0.00061 black
4 Dental.Insurance 0.01274 0.00067 black
5 Income_median 0.00655 0.00042 black
6 Student 0.00646 0.00062 black
7 Ethnicity 0.00614 0.00062 red
8 EnglishSpeak 0.00610 0.00059 black
9 Marital.Status 0.00593 0.00034 black
10 Religion 0.00569 0.00082 black
11 EnglishDiff 0.00477 0.00047 black
12 Retired 0.00344 0.00034 black
13 Employment 0.00344 0.00042 black
14 Familiarity.with.America 0.00254 0.00037 black
15 Gender 0.00225 0.00048 black
16 Familiarity.with.Ethnic.Origin 0.00223 0.00053 black
17 Discrimination 0.00117 0.00038 black
18 Primary.Language 0.00115 0.00032 black
19 US.Born 0.00086 0.00018 black
20 Identify.Ethnically 0.00077 0.00042 black
21 Part.Time.Employment 0.00058 0.00025 black
22 Homemaker 0.00024 0.00016 black
23 Belonging 0.00008 0.00030 black
24 Self.Employed.Full.Time 0.00000 0.00000 black
25 Self.Employed.Part.Time 0.00000 0.00000 black
26 Disabled 0.00000 0.00000 black
27 Unemployed 0.00000 0.00000 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Physical.Check.up ~ Duration.of.Residency + Health.Insurance +
Dental.Insurance + Income_median
Model 2: Physical.Check.up ~ Duration.of.Residency + Health.Insurance +
Dental.Insurance + Income_median + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1709 1855.7
2 1704 1826.1 5 29.563 1.797e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)Some of the variables were in matrix-format - probably you used
`scale()` on your data?
If so, and you get an error, please try `datawizard::standardize()` to
standardize your data.
Some of the variables were in matrix-format - probably you used
`scale()` on your data?
If so, and you get an error, please try `datawizard::standardize()` to
standardize your data.
data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1900.562 1892.892
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.52527 0.09241 5.684 1.32e-08 ***
Duration.of.Residency 0.66004 0.07135 9.251 < 2e-16 ***
Health.Insurance1 -0.49501 0.08975 -5.515 3.48e-08 ***
Dental.Insurance1 -0.21240 0.06805 -3.121 0.001801 **
Income_median1 -0.22176 0.06542 -3.390 0.000699 ***
Ethnicity1 0.02811 0.11660 0.241 0.809513
Ethnicity2 0.06514 0.12321 0.529 0.596980
Ethnicity3 0.50145 0.18332 2.735 0.006230 **
Ethnicity4 -0.44553 0.12263 -3.633 0.000280 ***
Ethnicity5 -0.50148 0.20344 -2.465 0.013700 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2110.4 on 1713 degrees of freedom
Residual deviance: 1826.1 on 1704 degrees of freedom
AIC: 1846.1
Number of Fisher Scoring iterations: 4
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.633 0.0309 Inf 0.571 0.691
Asian Indian 0.642 0.0325 Inf 0.576 0.702
Filipino 0.735 0.0423 Inf 0.644 0.809
Korean 0.518 0.0350 Inf 0.449 0.586
Other 0.504 0.0607 Inf 0.387 0.620
Vietnamese 0.705 0.0333 Inf 0.635 0.765
Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")|> summary(infer=T) contrast estimate SE df asymp.LCL asymp.UCL z.ratio p.value
Chinese effect 0.0281 0.117 Inf -0.2795 0.3357 0.241 0.8095
Asian Indian effect 0.0651 0.123 Inf -0.2599 0.3902 0.529 0.7164
Filipino effect 0.5015 0.183 Inf 0.0178 0.9851 2.735 0.0187
Korean effect -0.4455 0.123 Inf -0.7691 -0.1220 -3.633 0.0017
Other effect -0.5015 0.203 Inf -1.0382 0.0352 -2.465 0.0206
Vietnamese effect 0.3523 0.138 Inf -0.0114 0.7160 2.555 0.0206
Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median
Results are given on the log odds ratio (not the response) scale.
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
P value adjustment: fdr method for 6 tests
rfdata <- qol |>
select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)| Characteristic | N = 1,7091 |
|---|---|
| Dentist.Check.up | |
| 0 | 689 (40%) |
| Yes | 1,020 (60%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Dentist.Check.up=="0") |> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
set.seed(123321)
VSURF(Dentist.Check.up~.,
rfdata,
na.action="na.omit",
parallel=T,verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 42.3 secs
VSURF selected:
27 variables at thresholding step (in 22.1 secs)
3 variables at interpretation step (in 17.9 secs)
3 variables at prediction step (in 2.3 secs)
VSURF ran in parallel on a PSOCK cluster and used 7 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Duration.of.Residency" "Dental.Insurance" "Religion"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Duration.of.Residency" "Dental.Insurance" "Religion"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.2855471
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Duration.of.Residency 0.05253 0.00115 black
2 Dental.Insurance 0.04744 0.00074 black
3 Religion 0.01716 0.00079 black
4 Age 0.01390 0.00068 black
5 Ethnicity 0.01241 0.00062 red
6 Health.Insurance 0.00908 0.00038 black
7 EnglishSpeak 0.00735 0.00072 black
8 Income_median 0.00709 0.00038 black
9 Identify.Ethnically 0.00359 0.00040 black
10 EnglishDiff 0.00348 0.00045 black
11 Gender 0.00317 0.00044 black
12 Marital.Status 0.00296 0.00039 black
13 Belonging 0.00222 0.00050 black
14 Familiarity.with.America 0.00219 0.00038 black
15 Familiarity.with.Ethnic.Origin 0.00211 0.00038 black
16 Student 0.00207 0.00027 black
17 Primary.Language 0.00173 0.00034 black
18 Employment 0.00145 0.00031 black
19 Discrimination 0.00074 0.00027 black
20 US.Born 0.00070 0.00021 black
21 Homemaker 0.00014 0.00022 black
22 Part.Time.Employment 0.00008 0.00017 black
23 Retired 0.00005 0.00017 black
24 Self.Employed.Full.Time 0.00000 0.00000 black
25 Self.Employed.Part.Time 0.00000 0.00000 black
26 Disabled 0.00000 0.00000 black
27 Unemployed 0.00000 0.00000 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Dentist.Check.up ~ Duration.of.Residency + Dental.Insurance +
Religion
Model 2: Dentist.Check.up ~ Duration.of.Residency + Dental.Insurance +
Religion + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1700 1897.6
2 1695 1886.3 5 11.237 0.04688 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)Some of the variables were in matrix-format - probably you used
`scale()` on your data?
If so, and you get an error, please try `datawizard::standardize()` to
standardize your data.
Some of the variables were in matrix-format - probably you used
`scale()` on your data?
If so, and you get an error, please try `datawizard::standardize()` to
standardize your data.
data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1990.564 1964.582
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.241988 0.101446 2.385 0.0171 *
Duration.of.Residency 0.663596 0.067581 9.819 <2e-16 ***
Dental.Insurance1 -0.794684 0.059367 -13.386 <2e-16 ***
Religion1 0.031546 0.178166 0.177 0.8595
Religion2 0.375181 0.194618 1.928 0.0539 .
Religion3 0.185876 0.179582 1.035 0.3006
Religion4 -0.275420 0.257288 -1.070 0.2844
Religion5 -0.302753 0.332627 -0.910 0.3627
Religion6 -0.006214 0.391627 -0.016 0.9873
Ethnicity1 0.358734 0.145966 2.458 0.0140 *
Ethnicity2 -0.173972 0.260818 -0.667 0.5048
Ethnicity3 0.257446 0.192288 1.339 0.1806
Ethnicity4 -0.035071 0.150877 -0.232 0.8162
Ethnicity5 -0.225744 0.214870 -1.051 0.2934
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2304.7 on 1708 degrees of freedom
Residual deviance: 1886.4 on 1695 degrees of freedom
AIC: 1914.4
Number of Fisher Scoring iterations: 4
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.644 0.0411 Inf 0.560 0.720
Asian Indian 0.515 0.0602 Inf 0.399 0.630
Filipino 0.621 0.0557 Inf 0.507 0.723
Korean 0.550 0.0469 Inf 0.457 0.639
Other 0.503 0.0634 Inf 0.381 0.624
Vietnamese 0.514 0.0454 Inf 0.425 0.601
Results are averaged over the levels of: Dental.Insurance, Religion
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T) contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio
Chinese effect 1.432 0.209 Inf 0.974 2.10 1 2.458
Asian Indian effect 0.840 0.219 Inf 0.422 1.67 1 -0.667
Filipino effect 1.294 0.249 Inf 0.779 2.15 1 1.339
Korean effect 0.966 0.146 Inf 0.648 1.44 1 -0.232
Other effect 0.798 0.171 Inf 0.453 1.41 1 -1.051
Vietnamese effect 0.834 0.125 Inf 0.561 1.24 1 -1.206
p.value
0.0839
0.6057
0.4402
0.8162
0.4402
0.4402
Results are averaged over the levels of: Dental.Insurance, Religion
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
Intervals are back-transformed from the log odds ratio scale
P value adjustment: fdr method for 6 tests
Tests are performed on the log odds ratio scale
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired, Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |>
na.omit() |>
as.data.frame() |>
rename_with(make.names)rfdata |> gtsummary::tbl_summary(include=Folkmedicine)| Characteristic | N = 1,6951 |
|---|---|
| Folkmedicine | |
| 0 | 1,476 (87%) |
| Yes | 219 (13%) |
| 1 n (%) | |
n_minority <- rfdata |> filter(Folkmedicine=="Yes") |> nrow()
rf_options_for_vsurf <- list(
sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
strata = rfdata[,1], # Stratify by the response variable
importance = TRUE # Ensure importance is calculated
)
set.seed(123321)
VSURF(Folkmedicine~.,
rfdata,
na.action="na.omit",
parallel=T,
verbose=F,
rf.options=rf_options_for_vsurf)->vsurf.modWarning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
vsurf.mod |> summary()
VSURF computation time: 27.6 secs
VSURF selected:
22 variables at thresholding step (in 16.6 secs)
2 variables at interpretation step (in 10 secs)
2 variables at prediction step (in 1 secs)
VSURF ran in parallel on a PSOCK cluster and used 7 cores
names(rfdata[,-1])[vsurf.mod$varselect.pred][1] "Age" "Ethnicity"
names(rfdata[,-1])[vsurf.mod$varselect.interp][1] "Age" "Ethnicity"
plot(vsurf.mod)vsurf.mod$mean.perf[1] 0.130354
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
Importance = vsurf.mod$imp.mean.dec,
sd_Importance = vsurf.mod$imp.sd.dec
)|>
mutate(fill = case_when(Variable=="Ethnicity"~"red",
.default="black"))
vi |> mutate(across(Importance:sd_Importance,~round(.x,5))) Variable Importance sd_Importance fill
1 Age 0.02416 0.00084 black
2 Ethnicity 0.01459 0.00064 red
3 Duration.of.Residency 0.01291 0.00083 black
4 English.Speaking 0.00798 0.00075 black
5 Religion 0.00553 0.00045 black
6 Income_median 0.00545 0.00035 black
7 Student 0.00460 0.00028 black
8 English.Difficulties 0.00425 0.00061 black
9 Marital.Status 0.00382 0.00032 black
10 Retired 0.00329 0.00047 black
11 Familiarity.with.America 0.00286 0.00040 black
12 Primary.Language 0.00277 0.00036 black
13 Familiarity.with.Ethnic.Origin 0.00238 0.00031 black
14 Full.Time.Employment 0.00184 0.00031 black
15 Discrimination 0.00139 0.00022 black
16 Homemaker 0.00123 0.00033 black
17 Identify.Ethnically 0.00118 0.00039 black
18 US.Born 0.00104 0.00013 black
19 Dental.Insurance 0.00097 0.00025 black
20 Health.Insurance 0.00067 0.00016 black
21 Belonging 0.00059 0.00036 black
22 Gender 0.00034 0.00020 black
23 Part.Time.Employment 0.00002 0.00014 black
24 Self.Employed.Full.Time 0.00000 0.00000 black
25 Self.Employed.Part.Time 0.00000 0.00000 black
26 Disabled 0.00000 0.00000 black
27 Unemployed 0.00000 0.00000 black
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
geom_bar(stat = "identity",alpha=0.4) +
geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_manual(values=c("black","red"),
guide="none")
plot(importance_plot)ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred])
mod_form <- reformulate(pred_vars, response="Folkmedicine")
mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)Analysis of Deviance Table
Model 1: Folkmedicine ~ Age
Model 2: Folkmedicine ~ Age + Ethnicity
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 1693 1263.1
2 1688 1211.6 5 51.507 6.808e-10 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
performance::check_model(mod1)Some of the variables were in matrix-format - probably you used
`scale()` on your data?
If so, and you get an error, please try `datawizard::standardize()` to
standardize your data.
Some of the variables were in matrix-format - probably you used
`scale()` on your data?
If so, and you get an error, please try `datawizard::standardize()` to
standardize your data.
data.frame(BIC_ethnicity = BIC(mod1),
BIC_noethnicity=BIC(mod2)) BIC_ethnicity BIC_noethnicity
1 1263.66 1277.99
summary(mod1)
Call:
glm(formula = mod_form, family = "binomial", data = rfdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.11990 0.09787 -21.660 < 2e-16 ***
Age 0.46420 0.07478 6.207 5.39e-10 ***
Ethnicity1 0.53776 0.14362 3.744 0.000181 ***
Ethnicity2 -0.32424 0.18668 -1.737 0.082415 .
Ethnicity3 -0.22481 0.24241 -0.927 0.353716
Ethnicity4 0.81181 0.14791 5.488 4.05e-08 ***
Ethnicity5 -0.22784 0.31906 -0.714 0.475154
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1304.7 on 1694 degrees of freedom
Residual deviance: 1211.6 on 1688 degrees of freedom
AIC: 1225.6
Number of Fisher Scoring iterations: 5
mod1 |> emmeans::emmeans(~Ethnicity, type="response") Ethnicity prob SE df asymp.LCL asymp.UCL
Chinese 0.1646 0.0181 Inf 0.1320 0.2032
Asian Indian 0.0768 0.0139 Inf 0.0536 0.1089
Filipino 0.0841 0.0210 Inf 0.0511 0.1355
Korean 0.2058 0.0228 Inf 0.1647 0.2540
Other 0.0839 0.0286 Inf 0.0423 0.1598
Vietnamese 0.0609 0.0129 Inf 0.0400 0.0917
Confidence level used: 0.95
Intervals are back-transformed from the logit scale
mod1 |> emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T) contrast odds.ratio SE df asymp.LCL asymp.UCL null z.ratio
Chinese effect 1.712 0.246 Inf 1.172 2.501 1 3.744
Asian Indian effect 0.723 0.135 Inf 0.442 1.183 1 -1.737
Filipino effect 0.799 0.194 Inf 0.421 1.514 1 -0.927
Korean effect 2.252 0.333 Inf 1.524 3.327 1 5.488
Other effect 0.796 0.254 Inf 0.343 1.848 1 -0.714
Vietnamese effect 0.564 0.116 Inf 0.327 0.972 1 -2.776
p.value
0.0005
0.1236
0.4245
<.0001
0.4752
0.0110
Confidence level used: 0.95
Conf-level adjustment: bonferroni method for 6 estimates
Intervals are back-transformed from the log odds ratio scale
P value adjustment: fdr method for 6 tests
Tests are performed on the log odds ratio scale